# sunflower

In this vignette, we present a practical example of using the sunflower package to work with datasets that include a column of responses containing multiple answers. We demonstrate how to convert the dataset into a long format to obtain formal similarity metrics. Additionally, we illustrate how to perform error classification based on classical criteria found in the literature (e.g., Dell et al., 1997; Gold & Kertesz, 2001; see also, García-Orza et al., 2020).
# Attach the packages used throughout the vignette.
# library() is preferred over require(): require() returns FALSE when a
# package is missing instead of erroring, which silently hides setup problems.
library(sunflower)   # similarity metrics and error classification (this vignette)
library(tidyverse)   # data wrangling: dplyr, tidyr, stringr, ggplot2, ...
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
#> ✔ purrr 1.0.2
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)       # to work in R Markdown
library(kableExtra)  # to work in R Markdown (extended table styling)
#>
#> Attaching package: 'kableExtra'
#>
#> The following object is masked from 'package:dplyr':
#>
#> group_rows
library(rmarkdown)   # to work in R Markdown (paged_table)
# NOTE: ggplot2 is already attached by tidyverse; kept explicit for clarity.
library(ggplot2)     # to plot
Load the IGC dataset shipped with the package, select the
columns to keep, and visualize the result.
# Load the raw IGC data, keep only the relevant columns, restrict to the four
# tests of interest, and drop nonword tasks.
IGC <- readxl::read_xlsx("data/1-Gutiérrez-Cordero_data_RAW.xlsx") %>%
  dplyr::select(test, task_ID, task_type, ID, item_ID = task_item_ID,
                item, final_response, correct) %>%
  dplyr::filter(test %in% c("SnodgrassVanderwart", "BETA", "EPLA",
                            "Gutiérrez-Cordero")) %>%
  # Qualified with stringr:: for consistency with the dplyr:: calls above.
  dplyr::filter(!stringr::str_detect(task_type, "nonword")) %>%
  dplyr::arrange(ID)
# NOTE(review): View() only works in an interactive session; it is a no-op or
# an error during automated knitting — confirm the chunk is not evaluated then.
View(IGC)
rmarkdown::paged_table(IGC %>% dplyr::select(c(item_ID, item, final_response)),
                       options = list(rows.print = 8, align = "ccc"))
# Convert the data frame into a table object using tableGrob
table_plot <- gridExtra::tableGrob(
  IGC %>%
    dplyr::select(c(ID, item, final_response)) %>%
    dplyr::slice_head(n = 11)
)
# Create an empty plot to host the table, removing all margins.
# ggplot2:: qualified for consistency with the figure-2 chunk below.
# NOTE(review): `plot` shadows base::plot(); consider a more specific name.
plot <- ggplot2::ggplot() +
  ggplot2::theme_void() + # remove every graphical element
  ggplot2::annotation_custom(table_plot)
# Display the plot
plot
# Export at print quality, raster and vector.
ggsave("artwork/figure1.png", plot, width = 5.5, height = 3.75, dpi = 600)
ggsave("artwork/figure1.svg", plot, width = 5.5, height = 3.75, dpi = 600)
Separate the data using the separate_responses()
function and then rearrange it into long format using the
get_attempts() function, to be used in the following step.
# Split the comma-separated multi-response cells into one row per attempt
# (long format with Attempt / Response columns).
IGC_step1 <- IGC %>%
  separate_responses(col_name = "final_response",
                     separate_with = ", ") %>%
  # TRUE spelled out: T is an ordinary, reassignable variable in R.
  get_attempts(first_production = Attempt_1, drop_blank_spaces = TRUE)
# Slim copy for printing; task_type is not needed in the table.
IGC_step1_skinnydf <- IGC_step1 %>% dplyr::select(-c(task_type))
rmarkdown::paged_table(IGC_step1_skinnydf,
                       options = list(rows.print = 25, align = "ccc"))
# Convert the first 15 long-format rows into a table object using tableGrob
step1_preview <- IGC_step1_skinnydf %>%
  dplyr::select(c(ID, item, attempt = Attempt, response = Response)) %>%
  dplyr::slice_head(n = 15)
table_plot2 <- gridExtra::tableGrob(step1_preview)
# Create an empty plot to host the table, removing all margins
plot2 <- ggplot2::ggplot() +
  ggplot2::theme_void() + # strip every graphical element
  ggplot2::annotation_custom(table_plot2)
# Display the plot
plot2
# Save raster and vector versions
ggsave("artwork/figure2.png", plot2, width = 4.25, height = 4.75, dpi = 600)
ggsave("artwork/figure2.svg", plot2, width = 4.25, height = 4.75, dpi = 600)
Compute the similarity metrics using the
get_formal_similarity() function.
# Compute formal similarity metrics between each response and its target item.
IGC_step2 <- IGC_step1 %>% get_formal_similarity(item_col = "item",
                                                 response_col = "Response",
                                                 attempt_col = "Attempt",
                                                 group_cols = c("ID", "task_ID"))
#> The function get_formal_similarity() took 3.73 seconds to be executed
# Define determiner/filler terms to remove from the printed table
terms_to_remove <- c("el", "la", "un", "una", "vaya",
                     "los", "las", "unos", "unas", "no")
# Whole-word alternation pattern, e.g. "\\b(el|la|...)\\b"
pattern <- paste0("\\b(", paste(terms_to_remove, collapse = "|"), ")\\b")
# BUG FIX: the original built the filtered data frame and then immediately
# reassigned IGC_step2_skinnydf from the unfiltered IGC_step2, discarding the
# filter entirely. Chain both steps so the exclusion actually applies.
IGC_step2_skinnydf <- IGC_step2 %>%
  dplyr::filter(!stringr::str_detect(Response, pattern)) %>%
  dplyr::select(-c(task_type))
rmarkdown::paged_table(IGC_step2_skinnydf, options = list(rows.print = 25, align = "c"))
# Convert the similarity-metrics preview into a table object using tableGrob
table_plot3 <- gridExtra::tableGrob(
  IGC_step2_skinnydf %>% dplyr::select(-c(responseL, targetL)) %>%
    # Round every numeric metric for display
    dplyr::mutate(across(where(is.numeric), ~ round(., 3))) %>%
    dplyr::select(-comment_warning) %>%
    dplyr::rename(diff_chars = diff_char_num, attempt = Attempt,
                  response = Response) %>%
    dplyr::slice_head(n = 15) # first 15 rows (original comment wrongly said 7)
)
# Create an empty plot to host the table, removing all margins.
# ggplot2:: qualified for consistency with the figure-2 chunk.
plot3 <- ggplot2::ggplot() +
  ggplot2::theme_void() + # remove every graphical element
  ggplot2::annotation_custom(table_plot3)
# Display the plot
plot3
ggsave("artwork/figure4.png", plot3, width = 21, height = 5, dpi = 600)
ggsave("artwork/figure4.svg", plot3, width = 21, height = 5, dpi = 600)
Obtain the correct characters, in this case, letters, in their
correct position using the positional_accuracy()
function.
# Character-level positional accuracy: which characters of each response sit
# in their correct target position. `<-` used for assignment (tidyverse style).
IGC_step2.1 <- IGC_step2 %>%
  positional_accuracy(match_col = "itemL_adj_strict_match_pos",
                      last_ID_col = "targetL")
#> New names:
#> • `` -> `...1`
#> • `` -> `...2`
#> • `` -> `...3`
#> • `` -> `...4`
#> • `` -> `...5`
#> • `` -> `...6`
#> • `` -> `...7`
#> • `` -> `...8`
#> • `` -> `...9`
#> • `` -> `...10`
#> • `` -> `...11`
#> • `` -> `...12`
#> • `` -> `...13`
#> • `` -> `...14`
# Slim copy for printing, with lowercase display names.
IGC_step2.1_skinnydf <- IGC_step2.1 %>%
  dplyr::select(-c(task_ID, correct, task_type)) %>%
  dplyr::rename(attempt = Attempt, response = Response, position = Position)
rmarkdown::paged_table(IGC_step2.1_skinnydf, options = list(rows.print = 25, align = "c"))
This is a file (generated following the procedure described by [Dueñas Lerín](https://duenaslerin.com/diccionario-palabras-espanol-en-texto-script/)) containing all the Spanish words available in the RAE dictionary. It can be downloaded from the author’s page at https://github.com/JorgeDuenasLerin/diccionario-espanol-txt.
# Load the pre-trained word2vec embeddings. normalize = FALSE keeps the raw
# vector magnitudes (FALSE spelled out: F is a reassignable variable in R).
# NOTE(review): file.choose() requires an interactive session, so this chunk
# cannot run during automated knitting — confirm it is marked eval = FALSE.
m_w2v <- word2vec::read.word2vec(file = file.choose(), normalize = FALSE)
This is a file (generated using the word2vec algorithm by Cardellino) containing the embeddings of 1.5 billion words. It can be downloaded from the author’s page or from a mirror on GitHub, where other corpora can also be accessed; further details are provided by the author on that page.
# Re-run the pipeline from the raw data, keeping only the columns needed for
# the lexicality, formal-similarity, and semantic-similarity steps below.
IGC_step2_clean <- IGC %>%
  separate_responses(
    col_name = "final_response",
    separate_with = ", ") %>%
  get_attempts(
    # TRUE spelled out rather than the reassignable shorthand T
    first_production = Attempt_1, drop_blank_spaces = TRUE) %>%
  dplyr::select(task_ID, ID, item_ID, task_type, item, Response, RA, Attempt) %>%
  get_formal_similarity(item_col = "item", response_col = "Response",
                        attempt_col = "Attempt",
                        group_cols = c("ID", "item_ID"))
#> The function get_formal_similarity() took 2.70 seconds to be executed
# Slim copy for printing
IGC_step2clean_skinnydf <- IGC_step2_clean %>% dplyr::select(-c(task_ID, task_type))
rmarkdown::paged_table(IGC_step2clean_skinnydf, options = list(rows.print = 25, align = "c"))
# Deliberately blank out one Response and one item (leaving NAs) to check
# that the downstream functions handle missing values correctly.
IGC_step2_cleanNA <- IGC_step2_clean %>%
  dplyr::mutate(
    Response = replace(Response, 2, NA_character_),
    item = replace(item, 3, NA_character_)
  )
# Run the three annotation steps: lexicality against a word database, formal
# similarity, and word2vec cosine (semantic) similarity.
IGC_step3 <- IGC_step2_cleanNA %>%
  check_lexicality(item_col = "item", response_col = "Response", criterion = "database") %>%
  get_formal_similarity(item_col = "item", response_col = "Response",
                        attempt_col = "Attempt",
                        group_cols = c("ID", "item_ID")) %>%
  get_semantic_similarity(item_col = "item", response_col = "Response", model = m_w2v)
#> The function check_lexicality() took 4.67 seconds to be executed
#> The function get_formal_similarity() took 6.78 seconds to be executed
#> The function get_semantic_similarity() took 7.95 seconds to be executed
# Compute accessed col: 1 when the response exactly matches the target, else 0.
# NOTE(review): rows where Response or item is NA (injected above) yield
# accessed = NA, not 0, because == propagates NA — confirm classify_errors()
# expects NA rather than 0 for those rows.
IGC_step3 <- IGC_step3 %>%
  dplyr::mutate(accessed = dplyr::if_else(Response == item, 1, 0))
# Slim copy for printing
IGC_step3_skinnydf <- IGC_step3 %>% dplyr::select(-c(item_ID, task_type))
rmarkdown::paged_table(IGC_step3_skinnydf, options = list(rows.print = 25, align = "c"))
Proceed with the error classification.
# Classify each response following the classical criteria; TRUE spelled out
# (T is a reassignable variable, not a reserved word).
IGC_step4 <- IGC_step3 %>% classify_errors(access_col = "accessed",
                                           RA_col = "RA",
                                           response_col = "Response",
                                           item_col = "item",
                                           also_classify_RAs = TRUE) %>%
  dplyr::mutate(general_ID = dplyr::row_number()) # stable row identifier
#> The function classify_errors() took 0.01 seconds to be executed
# NOTE(review): View() is interactive-only; harmless when the chunk is skipped.
View(IGC_step4)
IGC_step4_print_skinny <- IGC_step4 %>%
  # Hand-picked rows: one example of each error type
  dplyr::filter(general_ID %in% c(8, 156, 13, 3284, 222, 3448, 5658)) %>%
  dplyr::select(general_ID, ID, item_ID, item, Response, RA,
                Attempt, lexicality, cosine_similarity, nonword:no_response, comment) %>%
  # The no-op rename `item_ID = item_ID` from the original was dropped.
  dplyr::rename(w2v_cos = cosine_similarity,
                attempt = Attempt, response = Response)
rmarkdown::paged_table(IGC_step4_print_skinny, options = list(rows.print = 15, align = "c"))
# Convert the data frame into a table object using tableGrob
table_plot4 <- gridExtra::tableGrob(
  IGC_step4_print_skinny %>%
    dplyr::slice_head(n = 7) # select the first 7 rows
)
# Create an empty plot to host the table, removing all margins.
# ggplot2:: qualified for consistency with the figure-2 chunk.
plot4 <- ggplot2::ggplot() +
  ggplot2::theme_void() + # remove every graphical element
  ggplot2::annotation_custom(table_plot4)
# Display the plot
plot4
ggsave("artwork/figure6.png", plot4, width = 15, height = 3.25, dpi = 600)
ggsave("artwork/figure6.svg", plot4, width = 15, height = 3.25, dpi = 600)